{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# FP树\n",
    "class treeNode:\n",
    "    def __init__(self, nameValue, numOccur, parentNode):\n",
    "        self.name = nameValue\n",
    "        self.count = numOccur\n",
    "        self.nodeLink = None\n",
    "        self.parent = parentNode\n",
    "        self.children = {}\n",
    "    \n",
    "    # 增加计数\n",
    "    def inc(self, numOccur):\n",
    "        self.count += numOccur\n",
    "        \n",
    "    # 展示函数\n",
    "    def disp(self, ind=1):\n",
    "        print('  '*ind, self.name, ' ', self.count)\n",
    "        for child in self.children.values():\n",
    "            child.disp(ind + 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   pyramid   9\n",
      "     eye   13\n"
     ]
    }
   ],
   "source": [
    "rootNode = treeNode('pyramid', 9, None)\n",
    "rootNode.children['eye'] = treeNode('eye', 13, None)\n",
    "\n",
    "rootNode.disp()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   pyramid   9\n",
      "     eye   13\n",
      "     phoenix   3\n"
     ]
    }
   ],
   "source": [
    "rootNode.children['phoenix'] = treeNode('phoenix', 3, None)\n",
    "rootNode.disp()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# FP树的构建\n",
    "def createTree(dataSet, minSup=1):\n",
    "    headerTable = {}\n",
    "    for trans in dataSet:  # 第一次遍历数据集并统计频次\n",
    "        for item in trans:\n",
    "            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]\n",
    "    for k in list(headerTable.keys()):  # 少于最小支持度的删除\n",
    "        if headerTable[k] < minSup:\n",
    "            del headerTable[k]\n",
    "    freqItemSet = set(headerTable.keys())\n",
    "    if len(freqItemSet) == 0:  # 如果没有频繁集则退出\n",
    "        return None, None\n",
    "    \n",
    "    # 对头指针表进行扩展，保存计数值及指向每种类型的第一个元素项的指针\n",
    "    for k in headerTable:\n",
    "        headerTable[k] = [headerTable[k], None]\n",
    "        \n",
    "    retTree = treeNode('Null Set', 1, None)\n",
    "    for tranSet, count in dataSet.items():\n",
    "        localD = {}\n",
    "        for item in tranSet:  # 第二次遍历只考虑频繁集\n",
    "            if item in freqItemSet:\n",
    "                localD[item] = headerTable[item][0]\n",
    "        if len(localD) > 0:  # 降序排序后更新\n",
    "            orderedItems = [v[0] for v in sorted(localD.items(), \\\n",
    "                                                 key=lambda p: p[1], reverse=True)]\n",
    "            updateTree(orderedItems, retTree, headerTable, count)\n",
    "    return retTree, headerTable\n",
    "\n",
    "def updateTree(items, inTree, headerTable, count):\n",
    "    if items[0] in inTree.children:  # 如果是子节点则增加计数\n",
    "        inTree.children[items[0]].inc(count)\n",
    "    else:  # 非子节点则创建节点并更新指向\n",
    "        inTree.children[items[0]] = treeNode(items[0], count, inTree)\n",
    "        if headerTable[items[0]][1] == None:\n",
    "            headerTable[items[0]][1] = inTree.children[items[0]]\n",
    "        else:\n",
    "            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])\n",
    "            \n",
    "    if len(items) > 1:  # 存在多个项则递归更新\n",
    "        updateTree(items[1::], inTree.children[items[0]], headerTable, count)\n",
    "\n",
    "# 确保节点链接指向树中该元素项的每一个实例\n",
    "def updateHeader(nodeToTest, targetNode):\n",
    "    while nodeToTest.nodeLink != None:\n",
    "        nodeToTest = nodeToTest.nodeLink\n",
    "    nodeToTest.nodeLink = targetNode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "def loadSimpDat():\n",
    "    simpDat = [['r', 'z', 'h', 'j', 'p'],\n",
    "              ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],\n",
    "              ['z'],\n",
    "              ['r', 'x', 'n', 'o', 's'],\n",
    "              ['y', 'r', 'x', 'z', 'q', 't', 'p'],\n",
    "              ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]\n",
    "    return simpDat\n",
    "\n",
    "# 将dataSet项集频次设置为1并以字典形式返回\n",
    "def createInitSet(dataSet):\n",
    "    retDict = {}\n",
    "    for trans in dataSet:\n",
    "        retDict[frozenset(trans)] = 1\n",
    "    return retDict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['r', 'z', 'h', 'j', 'p'],\n",
       " ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],\n",
       " ['z'],\n",
       " ['r', 'x', 'n', 'o', 's'],\n",
       " ['y', 'r', 'x', 'z', 'q', 't', 'p'],\n",
       " ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "simpDat = loadSimpDat()\n",
    "simpDat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{frozenset({'h', 'j', 'p', 'r', 'z'}): 1,\n",
       " frozenset({'s', 't', 'u', 'v', 'w', 'x', 'y', 'z'}): 1,\n",
       " frozenset({'z'}): 1,\n",
       " frozenset({'n', 'o', 'r', 's', 'x'}): 1,\n",
       " frozenset({'p', 'q', 'r', 't', 'x', 'y', 'z'}): 1,\n",
       " frozenset({'e', 'm', 'q', 's', 't', 'x', 'y', 'z'}): 1}"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "initSet = createInitSet(simpDat)\n",
    "initSet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Null Set   1\n",
      "     z   5\n",
      "       r   1\n",
      "       x   3\n",
      "         s   2\n",
      "           y   2\n",
      "             t   2\n",
      "         y   1\n",
      "           t   1\n",
      "             r   1\n",
      "     x   1\n",
      "       s   1\n",
      "         r   1\n"
     ]
    }
   ],
   "source": [
    "myFPtree, myHeaderTab = createTree(initSet, 3)\n",
    "myFPtree.disp()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 创建条件模式基\n",
    "# 上溯FP树\n",
    "def ascendTree(leafNode, prefixPath):\n",
    "    if leafNode.parent != None:\n",
    "        prefixPath.append(leafNode.name)\n",
    "        ascendTree(leafNode.parent, prefixPath)\n",
    "\n",
    "# 遍历链表直到结尾\n",
    "def findPrefixPath(basePat, treeNode):\n",
    "    condPats = {}  # 条件模式基字典\n",
    "    while treeNode != None:\n",
    "        prefixPath = []\n",
    "        ascendTree(treeNode, prefixPath)\n",
    "        if len(prefixPath) > 1:\n",
    "            condPats[frozenset(prefixPath[1:])] = treeNode.count\n",
    "        treeNode = treeNode.nodeLink  # 相关节点\n",
    "    return condPats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{frozenset({'z'}): 3}"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "findPrefixPath('x', myHeaderTab['x'][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{}"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "findPrefixPath('z', myHeaderTab['z'][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{frozenset({'z'}): 1,\n",
       " frozenset({'s', 'x'}): 1,\n",
       " frozenset({'t', 'x', 'y', 'z'}): 1}"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "findPrefixPath('r', myHeaderTab['r'][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "===================================\n",
      "('z', [5, <__main__.treeNode object at 0x000001AEF3FC28C8>])\n",
      "[5, <__main__.treeNode object at 0x000001AEF3FC28C8>]\n",
      "5\n",
      "===================================\n",
      "('r', [3, <__main__.treeNode object at 0x000001AEF3FC2E88>])\n",
      "[3, <__main__.treeNode object at 0x000001AEF3FC2E88>]\n",
      "3\n",
      "===================================\n",
      "('s', [3, <__main__.treeNode object at 0x000001AEF484C988>])\n",
      "[3, <__main__.treeNode object at 0x000001AEF484C988>]\n",
      "3\n",
      "===================================\n",
      "('y', [3, <__main__.treeNode object at 0x000001AEF484C4C8>])\n",
      "[3, <__main__.treeNode object at 0x000001AEF484C4C8>]\n",
      "3\n",
      "===================================\n",
      "('t', [3, <__main__.treeNode object at 0x000001AEF47E9BC8>])\n",
      "[3, <__main__.treeNode object at 0x000001AEF47E9BC8>]\n",
      "3\n",
      "===================================\n",
      "('x', [4, <__main__.treeNode object at 0x000001AEF3FC29C8>])\n",
      "[4, <__main__.treeNode object at 0x000001AEF3FC29C8>]\n",
      "4\n"
     ]
    }
   ],
   "source": [
    "for p in myHeaderTab.items():\n",
    "    print('===================================')\n",
    "    print(p)\n",
    "    print(p[1])\n",
    "    print(p[1][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 递归查找频繁项集\n",
    "def mineTree(inTree, headerTable, minSup, preFix, freqItemList):\n",
    "    # 按频次升序得到项集列表\n",
    "    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1][0])]\n",
    "#     print('bigL: ', bigL)  # 如：['r', 's', 'y', 't', 'x', 'z']\n",
    "    \n",
    "    for basePat in bigL:\n",
    "        newFreqSet = preFix.copy()  # 复制原频繁项集\n",
    "        newFreqSet.add(basePat)  # 添加频繁项集元素\n",
    "        freqItemList.append(newFreqSet)  # 添加频繁项集\n",
    "        \n",
    "        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])  # 构建条件基\n",
    "        myCondTree, myHead = createTree(condPattBases, minSup)  # 创建条件FP树\n",
    "        if myHead != None:\n",
    "#             print('conditional tree for: ', newFreqSet)\n",
    "#             myCondTree.disp(1)\n",
    "            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "conditional tree for:  {'s'}\n",
      "   Null Set   1\n",
      "     x   3\n",
      "conditional tree for:  {'y'}\n",
      "   Null Set   1\n",
      "     z   3\n",
      "       x   3\n",
      "conditional tree for:  {'y', 'x'}\n",
      "   Null Set   1\n",
      "     z   3\n",
      "conditional tree for:  {'t'}\n",
      "   Null Set   1\n",
      "     y   3\n",
      "       z   3\n",
      "         x   3\n",
      "conditional tree for:  {'z', 't'}\n",
      "   Null Set   1\n",
      "     y   3\n",
      "conditional tree for:  {'t', 'x'}\n",
      "   Null Set   1\n",
      "     z   3\n",
      "       y   3\n",
      "conditional tree for:  {'y', 't', 'x'}\n",
      "   Null Set   1\n",
      "     z   3\n",
      "conditional tree for:  {'x'}\n",
      "   Null Set   1\n",
      "     z   3\n"
     ]
    }
   ],
   "source": [
    "freItems = []  # 频繁项集\n",
    "mineTree(myFPtree, myHeaderTab, 3, set([]), freItems)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'r'},\n",
       " {'s'},\n",
       " {'s', 'x'},\n",
       " {'y'},\n",
       " {'y', 'z'},\n",
       " {'x', 'y'},\n",
       " {'x', 'y', 'z'},\n",
       " {'t'},\n",
       " {'t', 'y'},\n",
       " {'t', 'z'},\n",
       " {'t', 'y', 'z'},\n",
       " {'t', 'x'},\n",
       " {'t', 'x', 'z'},\n",
       " {'t', 'x', 'y'},\n",
       " {'t', 'x', 'y', 'z'},\n",
       " {'x'},\n",
       " {'x', 'z'},\n",
       " {'z'}]"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "freItems"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 实例：从新闻网站点击流中挖掘"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['1', '2', '3'], ['1'], ['4', '5', '6', '7']]"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 导入数据集\n",
    "parsedDat = [line.split() for line in open('kosarak.dat').readlines()]\n",
    "parsedDat[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 初始化数据\n",
    "initSet = createInitSet(parsedDat)\n",
    "# 构建FP树，并寻找至少被10万人浏览过的新闻报道\n",
    "myFPtree, myHeaderTab = createTree(initSet, 100000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存频繁集项\n",
    "myFreqList = []\n",
    "mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreqList)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看有多少新闻被10万人浏览过\n",
    "len(myFreqList)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'1'},\n",
       " {'1', '6'},\n",
       " {'3'},\n",
       " {'11', '3'},\n",
       " {'11', '3', '6'},\n",
       " {'3', '6'},\n",
       " {'11'},\n",
       " {'11', '6'},\n",
       " {'6'}]"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看具体新闻标号\n",
    "myFreqList"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
